{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gzip\n",
    "import os\n",
    "import pickle\n",
    "\n",
    "import numpy as np\n",
    "from sklearn import svm\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.linear_model import SGDClassifier\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaled_balanced_fit_1_test = np.load(gzip.open('scaled_balanced_fit_1_test.npy.gz', 'rb'))\n",
    "scaled_balanced_fit_10_test = np.load(gzip.open('scaled_balanced_fit_10_test.npy.gz', 'rb'))\n",
    "scaled_balanced_fit_10_test = np.load(gzip.open('scaled_balanced_fit_10_test.npy.gz', 'rb'))\n",
    "\n",
    "scaled_balanced_fit_cv = np.load(gzip.open('scaled_balanced_fit_cv.npy.gz', 'rb'))\n",
    "\n",
    "ordered_features = pickle.load(open('ordered_features', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(86882, 12)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scaled_balanced_fit_10_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_1_X = scaled_balanced_fit_1_test[:,:-1]\n",
    "train_1_Y = scaled_balanced_fit_1_test[:,-1]\n",
    "train_10_X = scaled_balanced_fit_10_test[:,:-1]\n",
    "train_10_Y = scaled_balanced_fit_10_test[:,-1]\n",
    "train_100_X = scaled_balanced_fit_10_test[:,:-1]\n",
    "train_100_Y = scaled_balanced_fit_10_test[:,-1]\n",
    "\n",
    "cv_X = scaled_balanced_fit_cv[:,:-1]\n",
    "cv_Y = scaled_balanced_fit_cv[:,-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.8960709467392955"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_svm_1 = svm.SVC(kernel='linear')\n",
    "my_svm_1.fit(train_1_X, train_1_Y)\n",
    "my_svm_1.score(cv_X, cv_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_svm_10 = svm.SVC(kernel='linear')\n",
    "my_svm_10.fit(train_10_X, train_10_Y)\n",
    "my_svm_10.score(cv_X, cv_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_svm_1_rbf = svm.SVC(C=10,kernel='rbf')\n",
    "my_svm_1_rbf.fit(train_1_X, train_1_Y)\n",
    "my_svm_1_rbf.score(cv_X, cv_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = my_svm_10_rbf.predict(cv_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "join = np.array([predictions, cv_Y])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "my_svm_10_C = svm.SVC(C=10, kernel='rbf')\n",
    "my_svm_10_C.fit(train_10_X, train_10_Y)\n",
    "my_svm_10_C.score(cv_X, cv_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['mean_fit_time',\n",
       " 'mean_score_time',\n",
       " 'mean_test_score',\n",
       " 'mean_train_score',\n",
       " 'param_C',\n",
       " 'param_kernel',\n",
       " 'params',\n",
       " 'rank_test_score',\n",
       " 'split0_test_score',\n",
       " 'split0_train_score',\n",
       " 'split1_test_score',\n",
       " 'split1_train_score',\n",
       " 'split2_test_score',\n",
       " 'split2_train_score',\n",
       " 'split3_test_score',\n",
       " 'split3_train_score',\n",
       " 'split4_test_score',\n",
       " 'split4_train_score',\n",
       " 'std_fit_time',\n",
       " 'std_score_time',\n",
       " 'std_test_score',\n",
       " 'std_train_score']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parameters = {'kernel': ['linear', 'rbf'], 'C':[0.1, 1, 10]}\n",
    "svc = svm.SVC()\n",
    "clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=5, n_jobs=8)\n",
    "clf.fit(train_1_X, train_1_Y)\n",
    "clf.best_score_ clf.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'GridSearchCV' object has no attribute 'best_params'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-7-bc396077d6d0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mclf2\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0msvc2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparam_grid\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m8\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mclf2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_1_X\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtrain_1_Y\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mclf2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclf2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbest_score\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;31mAttributeError\u001b[0m: 'GridSearchCV' object has no attribute 'best_params'"
     ]
    }
   ],
   "source": [
    "parameters = {'kernel': ['poly'], 'C':[10, 100], 'degree': [2, 3, 4, 5]}\n",
    "svc2 = svm.SVC()\n",
    "clf2 = GridSearchCV(estimator=svc2, param_grid=parameters, cv=5, n_jobs=8)\n",
    "clf2.fit(train_1_X, train_1_Y)\n",
    "clf2.best_params_, clf2.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "({'C': 100, 'degree': 3, 'kernel': 'poly'}, 0.9019745297347821)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf2.best_params_, clf2.best_score_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.81434747049889, {'C': 0.1, 'kernel': 'sigmoid'})"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parameters = {'kernel': ['sigmoid'], 'C':[0.1, 1, 10]}\n",
    "svc3 = svm.SVC()\n",
    "clf3 = GridSearchCV(estimator=svc3, param_grid=parameters, cv=5, n_jobs=8)\n",
    "clf3.fit(train_1_X, train_1_Y)\n",
    "clf3.best_score_, clf3.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaled_fit = np.load(gzip.open('scaled_fit.npy.gz', 'rb'))\n",
    "scaled_X = scaled_fit[:,:-1]\n",
    "scaled_Y = scaled_fit[:,-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9176841132718098"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "my_svm_1_rbf.score(scaled_X, scaled_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise',\n",
       "       estimator=SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,\n",
       "       eta0=0.0, fit_intercept=True, l1_ratio=0.15,\n",
       "       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,\n",
       "       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,\n",
       "       shuffle=True, tol=None, verbose=0, warm_start=False),\n",
       "       fit_params=None, iid=True, n_jobs=3,\n",
       "       param_grid={'alpha': [1e-05, 0.0001, 0.001], 'loss': ['hinge'], 'tol': [0.0001, 0.001, 0.01]},\n",
       "       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
       "       scoring=None, verbose=0)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sgd = SGDClassifier()\n",
    "\n",
    "parameters = {'alpha': [0.00001, 0.0001, 0.001], 'loss':['hinge'], 'tol': [1e-3]}\n",
    "csgd = GridSearchCV(estimator=sgd, param_grid=parameters, cv=5, n_jobs=3)\n",
    "\n",
    "csgd.fit(scaled_X, scaled_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(0.9512203307398348, {'alpha': 1e-05, 'loss': 'hinge', 'tol': 0.001})"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "csgd.best_score_, csgd.best_params_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9198535228997008"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sgd.score(scaled_X, scaled_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/tra/anaconda3/lib/python3.6/site-packages/sklearn/linear_model/stochastic_gradient.py:128: FutureWarning: max_iter and tol parameters have been added in <class 'sklearn.linear_model.stochastic_gradient.SGDClassifier'> in 0.19. If both are left unset, they default to max_iter=5 and tol=None. If tol is not None, max_iter defaults to max_iter=1000. From 0.21, default max_iter will be 1000, and default tol will be 1e-3.\n",
      "  \"and default tol will be 1e-3.\" % type(self), FutureWarning)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,\n",
       "       eta0=0.0, fit_intercept=True, l1_ratio=0.15,\n",
       "       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,\n",
       "       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,\n",
       "       shuffle=True, tol=None, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.kernel_approximation import RBFSampler\n",
    "rbf_feature = RBFSampler(gamma=1, random_state=1)\n",
    "X_features = rbf_feature.fit_transform(train_100_X)\n",
    "clfxx = SGDClassifier()\n",
    "clfxx.fit(X_features, train_100_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.627285"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "fff = rbf_feature.transform(scaled_X[:200000])\n",
    "clfxx.score(fff, scaled_Y[:200000])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "forest_estimator = RandomForestClassifier(n_estimators=100)\n",
    "forest_fit = forest_estimator.fit(train_10_X, train_10_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9237544073153457"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "forest_fit.score(scaled_X, scaled_Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
